NLTK
pip3 install nltk
import nltk
nltk.download("all")
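Downloading everything pulls a lot of data; if you only want the pieces used below, a lighter sketch (these are the standard NLTK resource ids):
import nltk
nltk.download("punkt")                        # tokenizer models
nltk.download("stopwords")                    # stop word lists
nltk.download("wordnet")                      # data for the lemmatizer
nltk.download("averaged_perceptron_tagger")   # POS tagger
nltk.download("maxent_ne_chunker")            # named entity chunker
nltk.download("words")                        # word list used by the NE chunker
nltk.download("state_union")                  # State of the Union corpus used below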
sent_tokenize -> Sentence Tokenizer, splits a body of text into sentences
word_tokenize -> Word Tokenizer, splits a sentence into words
from nltk.tokenize import sent_tokenize, word_tokenize
sampleText = "your text file"
print(sent_tokenize(sampleText))
print(word_tokenize(sampleText))
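For example, a quick sketch on a made-up sentence (the outputs in the comments are indicative):
from nltk.tokenize import sent_tokenize, word_tokenize
txt = "Hello Mr. Smith. How are you today?"
print(sent_tokenize(txt))  # ['Hello Mr. Smith.', 'How are you today?'] -- "Mr." is not treated as a sentence end
print(word_tokenize(txt))  # ['Hello', 'Mr.', 'Smith', '.', 'How', 'are', 'you', 'today', '?']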
Chunking: grouping words into meaningful groups (phrases) based on their part-of-speech tags
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
train_text = state_union.raw("2005-GWBush.txt")  # corpus text used to train the custom sentence tokenizer
txt = "Kids are playing. Kids like to play games. He got played"
custTokenizer = PunktSentenceTokenizer(train_text)
tokenizedText = custTokenizer.tokenize(txt)
for i in tokenizedText:
    words = nltk.word_tokenize(i)
    tag = nltk.pos_tag(words)
    # chunk: any number of adverbs and verbs, followed by one or more proper nouns and an optional noun
    chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
    chunkParser = nltk.RegexpParser(chunkGram)
    chunked = chunkParser.parse(tag)
    #chunked.draw()
    chunked.pretty_print()
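Instead of printing the whole parse tree, the matched chunks can also be pulled out of a parsed tree directly; a small sketch using the Tree API on the last parsed sentence:
for subtree in chunked.subtrees(filter=lambda t: t.label() == "Chunk"):
    print(" ".join(word for word, tag in subtree.leaves()))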
Stop words are words that don't contribute much to the meaning of a text, i.e. filler words like "a", "the"...
These words are removed so that the text is easier for a machine to understand.
stopwords.words("english")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
txt = "your text file / input"
words = word_tokenize(txt)
stop_words = set(stopwords.words("english"))  # a set makes the membership test much faster
nw = [i for i in words if i.lower() not in stop_words]
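A quick sanity check on a made-up sentence (the result in the comment is indicative):
txt = "This is an example showing off stop word filtration."
words = word_tokenize(txt)
print([w for w in words if w.lower() not in stop_words])
# e.g. ['example', 'showing', 'stop', 'word', 'filtration', '.']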
import nltk
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union
# state_union data set will be used for training PunktSentenceTokenizer to create a custom tokenizer
txt = "Your text file or input"
custTokenizer = PunktSentenceTokenizer(state_union.raw("2006-GWBush.txt"))
tokenizedText = custTokenizer.tokenize(txt)
for i in tokenizedText:
    words = nltk.word_tokenize(i)
    tag = nltk.pos_tag(words)
    nameEnt = nltk.ne_chunk(tag)  # chunk named entities (PERSON, ORGANIZATION, GPE, ...)
    nameEnt.pretty_print()
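To read the entities out of the tree instead of drawing it, a minimal sketch (labels such as PERSON, ORGANIZATION, GPE come from the chunker):
for subtree in nameEnt:
    if isinstance(subtree, nltk.Tree):  # named entities come back as subtrees, plain tokens as (word, tag) tuples
        entity = " ".join(word for word, tag in subtree.leaves())
        print(subtree.label(), "->", entity)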
Stemming is the process of reducing a word to its word stem, i.e. the base or root form left after stripping affixes such as prefixes and suffixes.
ps.stem("word")
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
ps = PorterStemmer()
txt = "Kids are playing. Kids like to play games. He got played"
wt = word_tokenize(txt)
print([ps.stem(i) for i in wt])
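NLTK ships other stemmers as well; a quick sketch comparing Porter with the Snowball ("english") stemmer on the same tokens:
from nltk.stem import SnowballStemmer
ss = SnowballStemmer("english")
for w in wt:
    print(w, "->", ps.stem(w), "/", ss.stem(w))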
A very similar operation to stemming is lemmatizing. The major difference is that stemming can often create non-existent words, whereas lemmas are actual dictionary words.
pos = "a"
⇒ Adjective
pos = "v"
⇒ Verb
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run",'v'))
import nltk
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union
# state_union data set will be used for training PunktSentenceTokenizer to create a custom tokenizer
txt = "Kids are playing. Kids like to play games. He got played"
custTokenizer = PunktSentenceTokenizer(state_union.raw("2006-GWBush.txt"))
tokenizedText = custTokenizer.tokenize(txt)
for i in tokenizedText:
    words = nltk.word_tokenize(i)
    tag = nltk.pos_tag(words)
    print(tag)
state_union.raw("2006-GWBush.txt")
This is the txt dataset we will be using for training, other are available, you can use your own.
Example output
[('Kids', 'NNS'), ('are', 'VBP'), ('playing', 'VBG'), ('.', '.')]
[('Kids', 'NNS'), ('like', 'IN'), ('to', 'TO'), ('play', 'VB'), ('games', 'NNS'), ('.', '.')]
[('He', 'PRP'), ('got', 'VBD'), ('played', 'JJ')]
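If a tag abbreviation is unfamiliar, NLTK can describe it:
nltk.help.upenn_tagset("NNS")  # noun, common, plural
nltk.help.upenn_tagset("VBP")  # verb, present tense, not 3rd person singular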